Load libraries and data:
library(car)
library(tidyverse)
library(modelr)
library(GGally)
library(relaimpo)
library(lm.beta)
library(fastDummies)
# Read the house-sales CSV (path is relative to the working directory) into `houses`.
houses <- read_csv("data/kc_house_data.csv")
Check for any missing values:
# Count the missing values in every column. A single lambda keeps the original
# column names and replaces the deprecated `funs()` helper, so the dplyr
# deprecation warning is no longer raised.
houses %>%
  summarise_all(~ sum(is.na(.)))
Remove columns that are not useful:
# Drop the columns that are not used as predictors, then list what is left.
houses_clean <- dplyr::select(
  houses,
  -date, -id, -sqft_living15, -sqft_lot15, -zipcode
)
names(houses_clean)
## [1] "price" "bedrooms" "bathrooms" "sqft_living"
## [5] "sqft_lot" "floors" "waterfront" "view"
## [9] "condition" "grade" "sqft_above" "sqft_basement"
## [13] "yr_built" "yr_renovated" "lat" "long"
Convert waterfront into a logical variable:
# Recode the waterfront flag as a logical column: TRUE where it equals 1.
# The comparison already yields TRUE/FALSE (and NA for NA), so no
# `ifelse(..., T, F)` wrapper or reassignable `T`/`F` shortcut is needed.
houses_clean <- houses_clean %>%
  mutate(waterfront = waterfront == 1)
houses_clean
Convert ‘yr_renovated’ into a ‘renovated’ logical variable:
# Replace the renovation year with a logical flag: a year of 0 is treated as
# "never renovated" (FALSE), anything else as TRUE. The comparison yields the
# logical directly, avoiding `ifelse(..., F, T)` and the reassignable `T`/`F`.
houses_clean <- houses_clean %>%
  mutate(renovated = yr_renovated != 0) %>%
  dplyr::select(-yr_renovated)
houses_clean
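‘Condition’ and ‘Grade’ are both categorical variables stored as numbers. To model them as categories rather than as continuous quantities, first check the range of each, then create ‘dummy variables’ for both condition and grade: one 0/1 indicator column per level, with the first level of each dropped to act as the baseline: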
# Smallest and largest codes observed for grade and condition.
summarise(
  houses_clean,
  min_grade = min(grade),
  max_grade = max(grade),
  min_condition = min(condition),
  max_condition = max(condition)
)
# Expand grade and condition into indicator columns (grade_*, condition_*).
# `remove_first_dummy = TRUE` drops the first level of each so that level acts
# as the baseline; the original grade/condition columns are then removed.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
houses_dummy <- houses_clean %>%
  dummy_cols(
    select_columns = c("grade", "condition"),
    remove_first_dummy = TRUE
  ) %>%
  dplyr::select(-c(condition, grade))
houses_dummy
# Fit price on every remaining column and list any linearly dependent terms.
houses_dummy %>%
  lm(price ~ ., data = .) %>%
  alias()
## Model :
## price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
## waterfront + view + sqft_above + sqft_basement + yr_built +
## lat + long + renovated + grade_3 + grade_4 + grade_5 + grade_6 +
## grade_7 + grade_8 + grade_9 + grade_10 + grade_11 + grade_12 +
## grade_13 + condition_2 + condition_3 + condition_4 + condition_5
##
## Complete :
## (Intercept) bedrooms bathrooms sqft_living sqft_lot floors
## sqft_basement 0 0 0 1 0 0
## waterfrontTRUE view sqft_above yr_built lat long renovatedTRUE
## sqft_basement 0 0 -1 0 0 0 0
## grade_3 grade_4 grade_5 grade_6 grade_7 grade_8 grade_9 grade_10
## sqft_basement 0 0 0 0 0 0 0 0
## grade_11 grade_12 grade_13 condition_2 condition_3 condition_4
## sqft_basement 0 0 0 0 0 0
## condition_5
## sqft_basement 0
# The alias table reports sqft_basement as aliased (coefficient 1 on
# sqft_living and -1 on sqft_above), so it is dropped from the predictors.
houses_trim <- dplyr::select(houses_dummy, -sqft_basement)
houses_trim
# Re-run the alias check on the trimmed data to confirm nothing is still aliased.
houses_trim %>%
  lm(price ~ ., data = .) %>%
  alias()
## Model :
## price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
## waterfront + view + sqft_above + yr_built + lat + long +
## renovated + grade_3 + grade_4 + grade_5 + grade_6 + grade_7 +
## grade_8 + grade_9 + grade_10 + grade_11 + grade_12 + grade_13 +
## condition_2 + condition_3 + condition_4 + condition_5
# no aliases found
First predictor
# Split the columns by type so each set gets its own pairs plot. price is
# numeric, so it is re-attached to the non-numeric set as the last column.
houses_trim_numeric <- select_if(houses_trim, is.numeric)
houses_trim_nonnumeric <- houses_trim %>%
  select_if(~ !is.numeric(.)) %>%
  mutate(price = houses_trim$price)

ggpairs(houses_trim_numeric)
ggpairs(houses_trim_nonnumeric)
Model 1
Price ~ Bedrooms
# Simple linear regression of price on bedrooms alone; evaluating the object
# name prints the call and the fitted coefficients.
model1 <- lm(price ~ bedrooms, data = houses_trim)
model1
##
## Call:
## lm(formula = price ~ bedrooms, data = houses_trim)
##
## Coefficients:
## (Intercept) bedrooms
## 129802 121716
Summary
# Full summary: residual quantiles, coefficient table, R-squared and F-statistic.
summary(model1)
##
## Call:
## lm(formula = price ~ bedrooms, data = houses_trim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3506435 -203235 -66667 105049 6839901
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 129802 8932 14.53 <2e-16 ***
## bedrooms 121716 2554 47.65 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 349200 on 21611 degrees of freedom
## Multiple R-squared: 0.09508, Adjusted R-squared: 0.09504
## F-statistic: 2271 on 1 and 21611 DF, p-value: < 2.2e-16
Plot
# Draw the lm diagnostic plots for model1 in a 2 x 2 grid, then put the
# previous graphics settings back so later plots are not forced into the
# same layout.
old_par <- par(mfrow = c(2, 2))
plot(model1)
par(old_par)
Model 2
Price ~ sqft_living
# Simple linear regression of price on sqft_living alone; evaluating the
# object name prints the call and the fitted coefficients.
model2 <- lm(price ~ sqft_living, data = houses_trim)
model2
##
## Call:
## lm(formula = price ~ sqft_living, data = houses_trim)
##
## Coefficients:
## (Intercept) sqft_living
## -43580.7 280.6
Summary
# Full summary: residual quantiles, coefficient table, R-squared and F-statistic.
summary(model2)
##
## Call:
## lm(formula = price ~ sqft_living, data = houses_trim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1476062 -147486 -24043 106182 4362067
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43580.743 4402.690 -9.899 <2e-16 ***
## sqft_living 280.624 1.936 144.920 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 261500 on 21611 degrees of freedom
## Multiple R-squared: 0.4929, Adjusted R-squared: 0.4928
## F-statistic: 2.1e+04 on 1 and 21611 DF, p-value: < 2.2e-16
Plot
# Draw the lm diagnostic plots for model2 in a 2 x 2 grid, then put the
# previous graphics settings back so later plots are not forced into the
# same layout.
old_par <- par(mfrow = c(2, 2))
plot(model2)
par(old_par)
Model 3
Price ~ Condition
# Regression of price on condition, entered as its four dummy columns
# (condition_2 .. condition_5). The condition level that has no dummy column
# is the baseline, so its mean price is carried by the intercept.
model3 <- lm(price ~ condition_2 + condition_3 + condition_4 + condition_5, data = houses_trim)
model3
##
## Call:
## lm(formula = price ~ condition_2 + condition_3 + condition_4 +
## condition_5, data = houses_trim)
##
## Coefficients:
## (Intercept) condition_2 condition_3 condition_4 condition_5
## 334432 -7145 207581 186769 277986
Summary
# Full summary: residual quantiles, coefficient table, R-squared and F-statistic.
summary(model3)
##
## Call:
## lm(formula = price ~ condition_2 + condition_3 + condition_4 +
## condition_5, data = houses_trim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -502418 -217013 -87013 102800 7178800
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 334432 66803 5.006 5.59e-07 ***
## condition_2 -7144 72395 -0.099 0.92139
## condition_3 207581 66875 3.104 0.00191 **
## condition_4 186769 66979 2.788 0.00530 **
## condition_5 277986 67390 4.125 3.72e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 365900 on 21608 degrees of freedom
## Multiple R-squared: 0.006878, Adjusted R-squared: 0.006694
## F-statistic: 37.41 on 4 and 21608 DF, p-value: < 2.2e-16
Plot
# Draw the lm diagnostic plots for model3 in a 2 x 2 grid, then put the
# previous graphics settings back so later plots are not forced into the
# same layout.
old_par <- par(mfrow = c(2, 2))
plot(model3)
par(old_par)
Model 4
Price ~ Waterfront
# Regression of price on the logical waterfront flag; lm reports the effect of
# TRUE relative to FALSE as the `waterfrontTRUE` coefficient.
model4 <- lm(price ~ waterfront, data = houses_trim)
model4
##
## Call:
## lm(formula = price ~ waterfront, data = houses_trim)
##
## Coefficients:
## (Intercept) waterfrontTRUE
## 531564 1130312
# Full summary: residual quantiles, coefficient table, R-squared and F-statistic.
summary(model4)
##
## Call:
## lm(formula = price ~ waterfront, data = houses_trim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1376876 -211564 -81564 108436 7168436
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 531564 2416 220.00 <2e-16 ***
## waterfrontTRUE 1130312 27822 40.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 353900 on 21611 degrees of freedom
## Multiple R-squared: 0.07095, Adjusted R-squared: 0.07091
## F-statistic: 1650 on 1 and 21611 DF, p-value: < 2.2e-16
Model 5
Price ~ bedrooms + sqft_living
# Two-predictor model: price on bedrooms and sqft_living together.
# Note that in the summary the bedrooms estimate is negative, whereas it was
# positive in the bedrooms-only model (model1).
model5 <- lm(price ~ bedrooms + sqft_living, data = houses_trim)
summary(model5)
##
## Call:
## lm(formula = price ~ bedrooms + sqft_living, data = houses_trim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1650867 -143866 -23143 102344 4179850
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79469.359 6604.764 12.03 <2e-16 ***
## bedrooms -57066.759 2308.223 -24.72 <2e-16 ***
## sqft_living 313.949 2.337 134.31 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 257800 on 21610 degrees of freedom
## Multiple R-squared: 0.5068, Adjusted R-squared: 0.5068
## F-statistic: 1.11e+04 on 2 and 21610 DF, p-value: < 2.2e-16